
# coding: utf-8

# In[1]:


import pandas as pd


# In[2]:


ner = pd.read_excel("NER_GV_Deduplicate Wikifier.xlsx")
media_cloud_tagger = pd.read_excel("GV_News_Tagged_replaced.xlsx")
media_cloud_tagger.drop_duplicates(subset=['Article'],inplace=True)

ner=ner[ner['tag']!='sentence']
# In[3]:


replace_columns = ["Topic %s"%i for i in range(1,21)]


# In[4]:


ner["Lable And Topic 1"] = ner["LABEL"] + '_' + ner["Topic 1"]
bi_ner = ner[["Article","Lable And Topic 1"]]
count_ner = bi_ner.groupby(["Article","Lable And Topic 1"]).size().reset_index()
stack_count_ner = count_ner.set_index(["Article","Lable And Topic 1"]).unstack('Lable And Topic 1')
stack_count_ner.columns = stack_count_ner.columns.get_level_values(1)
stack_count_ner = stack_count_ner.reset_index()
result = pd.merge(media_cloud_tagger,stack_count_ner,on="Article",how='left')
result = result.fillna(0)
cloud_dump = result.copy()
cloud_article_topics = media_cloud_tagger[["Article"]+replace_columns]


# In[5]:


media_cloud600 = pd.read_excel("Media Cloud 600 Topics.xlsx")
categorys_lst = []
with open("categorys.txt") as f:
    categorys_lst = [category.strip('\n').split(',') for category in f.readlines()]
media_cloud600_droped = media_cloud600[media_cloud600["Topic category"]!=99]


# In[6]:


tag2category_dict = {}

for _, row in media_cloud600_droped.iterrows():
    tag = row["tag"]
    category = row["Topic category"]
    tag2category_dict[tag] = category


# In[7]:


category_lst = list(set(tag2category_dict.values()))


# In[8]:


category_lst


# In[9]:


def topic_category(cate):
    return "TOPIC_"+cate


# In[10]:


D = {}
for category in category_lst:
    D[category] = []
D["Unknown"] = []
D["Article"] = []


# In[11]:


def article_topic_invert(row_data):
    article = row_data["Article"]
    topics = list(row_data[1:4])
    unique_topics = list(set(topics))
    for category in category_lst+["Unknown"]:
        if category in unique_topics:
            D[category] += [topics.index(category)+1]
        else:
            D[category] += [0]
    D["Article"] += [article]     


# In[12]:


cloud_article_topics.apply(article_topic_invert,axis=1)


# In[13]:


df2 = pd.DataFrame(D)


# In[14]:


column_name = {column:"TOPIC_"+column for column in df2.columns[:-2]}


# In[15]:


df2.rename(column_name,axis=1,inplace=True)


# In[16]:


merge_df = pd.merge(df2,result,on="Article")


# In[17]:


columns  = sorted(list(merge_df.columns))


# In[18]:


for column in list(media_cloud_tagger.columns):
    columns.remove(column)


# In[19]:


columns = list(media_cloud_tagger.columns) + columns


# In[20]:


merge_df = merge_df[columns]


# In[21]:


from numpy import nan


# In[22]:


merge_df.replace(0,nan,inplace=True)
merge_df.dropna(axis=1,how='all',inplace=True)
merge_df.dropna(subset=list(merge_df.columns[52:]),how='all',inplace=True)
df_list=list(merge_df.columns)


df = merge_df[df_list]
df.replace(nan,0,inplace=True)
df.to_excel('Network_GV_nonSentence_GPE.xlsx')

